#' ---
#' title: Multilingual Record Linkage Application
#' author: Joe Ornstein
#' date: 2025-06-20
#' version: 0.21
#' ---

library(tidyverse)
library(fuzzylink)

## Load ParlGov Dataset ------------------------------------

# Country lookup table; the linkage loops further down read its
# `country_name` and `adjective` columns.
countries <- read_csv("raw/parlgov_countries.csv")

# Election results, narrowed to the party-level rows used for linkage.
# All conditions are combined in a single filter() call (rows must satisfy
# every one of them).
elections <- read_csv("raw/parlgov_elections.csv") |>
  filter(
    # drop the English-speaking countries
    !(country_name %in% c("Australia", "New Zealand", "Canada",
                          "United Kingdom", "Ireland")),
    # drop seat categories that are not assigned to a party
    !(party_name_english %in% c("one seat", "one-seat", "no seat",
                                "others", "ethnic", "no party affiliation")),
    # parliamentary elections only (not European Parliament)
    election_type == "parliament",
    # only parties that won at least one seat
    !is.na(seats),
    seats > 0
  ) |>
  select(
    country_name, election_date,
    party_name_english, party_name,
    seats, left_right
  )

# Build the two sides of the linkage problem from the same election rows.

# English side: the English party name (exposed as `party_name` so both tables
# share the linkage key) plus the left-right score.
english_names <- elections |>
  select(country_name, election_date, party_name_english, left_right) |>
  rename(party_name = party_name_english)

# Native-language side: the original party name plus seats won.
native_names <- elections |>
  select(all_of(c("country_name", "election_date", "party_name", "seats")))


## fuzzylink() -------------------------

# Settings forwarded to fuzzylink() as `model`, `embedding_model`, and `fmla`.
model <- 'gpt-4o-2024-11-20'
embedding_model <- 'text-embedding-3-large'
fmla <- match ~ sim + jw

# One output folder per model / formula combination. deparse1() always yields
# a single string, so the path stays length one even if the formula grows long
# enough for deparse() to split it across several strings.
merge_dir <- file.path('data/parties-merge', model, deparse1(fmla))

# Create the folder up front so save() cannot fail after the API calls have
# already been made.
dir.create(merge_dir, recursive = TRUE, showWarnings = FALSE)

# Link native-language party names to their English names, one country at a
# time, and save each country's result as <country>.RData in `merge_dir`.
# seq_len() keeps the loop from running when `countries` has zero rows.
for (i in seq_len(nrow(countries))) {

  print(i)
  cn <- countries$country_name[i]
  print(cn)

  d1 <- native_names |> filter(country_name == cn)
  d2 <- english_names |> filter(country_name == cn)

  # `elections` excludes several countries, so a country listed in `countries`
  # may have no rows here; skip it rather than call fuzzylink() on empty data.
  if (nrow(d1) == 0 || nrow(d2) == 0) {
    message('Skipping ', cn, ': no party records to link.')
    next
  }

  # Values are not auto-printed inside a for loop, so print the timing
  # explicitly; otherwise the system.time() result is discarded.
  print(system.time(df <- fuzzylink(d1, d2,
                                    by = 'party_name',
                                    blocking.variables = c('country_name', 'election_date'),
                                    record_type = paste(countries$adjective[i],
                                                        'political party or coalition'),
                                    instructions = "The first name will come from a list of political parties in the country's native language. The second name will come from a list of English translations. Misspellings, alternative names, and acronyms may be acceptable matches.",
                                    model = model,
                                    embedding_model = embedding_model,
                                    fmla = fmla)))

  # GPT-4o:
  # 7 minutes total and approximately $4.20 in API fees

  # open-mixtral-8x22b:
  # 80 minutes total; cost similar to GPT-4o

  save(df, file = file.path(merge_dir, paste0(cn, '.RData')))
}


## Return all pairs to assess calibration of estimated match probabilities -----------

# Settings forwarded to fuzzylink() as `model`, `embedding_model`, and `fmla`.
# The trailing comments name the alternative models.
model <- 'gpt-4o-2024-11-20' # 'open-mixtral-8x22b'
embedding_model <- 'text-embedding-3-large' # 'mistral-embed'
fmla <- match ~ sim + jw

# One output folder per model / formula combination. deparse1() always yields
# a single string, so the path stays length one even if the formula grows long
# enough for deparse() to split it across several strings.
calibration_dir <- file.path('data/parlgov-calibration', model, deparse1(fmla))

# Create the folder up front so save() cannot fail after the API calls have
# already been made.
dir.create(calibration_dir, recursive = TRUE, showWarnings = FALSE)

# Same per-country linkage as the main run, but with return_all_pairs = TRUE,
# saving each country's result as <country>.RData in `calibration_dir`.
# seq_len() keeps the loop from running when `countries` has zero rows.
for (i in seq_len(nrow(countries))) {

  print(i)
  cn <- countries$country_name[i]
  print(cn)

  d1 <- native_names |> filter(country_name == cn)
  d2 <- english_names |> filter(country_name == cn)

  # `elections` excludes several countries, so a country listed in `countries`
  # may have no rows here; skip it rather than call fuzzylink() on empty data.
  if (nrow(d1) == 0 || nrow(d2) == 0) {
    message('Skipping ', cn, ': no party records to link.')
    next
  }

  # Values are not auto-printed inside a for loop, so print the timing
  # explicitly; otherwise the system.time() result is discarded.
  print(system.time(df <- fuzzylink(d1, d2,
                                    by = 'party_name',
                                    blocking.variables = c('country_name', 'election_date'),
                                    record_type = paste(countries$adjective[i],
                                                        'political party or coalition'),
                                    instructions = "The first name will come from a list of political parties in the country's native language. The second name will come from a list of English translations. Misspellings, alternative names, and acronyms may be acceptable matches.",
                                    model = model,
                                    embedding_model = embedding_model,
                                    fmla = fmla,
                                    return_all_pairs = TRUE)))

  save(df, file = file.path(calibration_dir, paste0(cn, '.RData')))
}
